Question 2: Compare the classification performance of linear regression and k-nearest neighbor classification on the zipcode data. In particular, consider only the 2's and 3's, and k = 1, 3, 5, 7 and 15. Show both the training and test error for each choice. The zipcode data are available from the book website www-stat.stanford.edu/ElemStatLearn.

Read training dataset

# Read the zipcode training and test sets (whitespace-separated; column 1 is
# the digit label 0-9, the remaining 256 columns are pixel features).
# NOTE(review): absolute, machine-specific paths — prefer relative paths.
zip <- as.matrix(read.table("/Users/pawanjeetkaur/Downloads/IDS-575- MachineLearning Core/Assignment_2/zip.train"))
zip_test <- as.matrix(read.table("/Users/pawanjeetkaur/Downloads/IDS-575- MachineLearning Core/Assignment_2/zip.test"))

# Row indices of training images labelled 2 or 3 (digit label is column 1)
data_2_3_digits <- which(zip[, 1] == 2 | zip[, 1] == 3)
nrow(as.matrix(data_2_3_digits))
## [1] 1389
# Same selection for the test set
data_2_3_digits_test <- which(zip_test[, 1] == 2 | zip_test[, 1] == 3)
nrow(as.matrix(data_2_3_digits_test))
## [1] 364
# Sanity checks on the full training matrix: 7291 rows x 257 cols, labels 0-9
dim(zip)
## [1] 7291  257
summary(zip[,1])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   4.000   3.903   7.000   9.000

Divide training dataset in independent and dependent variables for knn

# Build data frames of the 2/3 subsets and separate the label column
# (column 1, used as the class vector for knn) from the pixel predictors.
data_full_train <- data.frame(zip[data_2_3_digits, ])

zip_Y_train <- data_full_train[, 1]
zip_X_train <- data_full_train[, -1]

data_full_test <- data.frame(zip_test[data_2_3_digits_test, ])

zip_Y_test <- data_full_test[, 1]
zip_X_test <- data_full_test[, -1]

Run multiple regression

# Regress the digit label (V1, values 2 or 3) on all pixel columns; the
# continuous predictions are later rounded to the nearest integer to classify.
lm_reg <- lm(V1 ~  . , data = data_full_train)
#summary(lm_reg)

Run prediction on training data

# Training-set predictions: round the continuous lm output to the nearest
# integer label (2 or 3). round() is vectorised, so sapply() is unnecessary.
# NOTE(review): if a prediction rounds outside {2, 3} the confusion table is
# not square and diag() misaligns — worth guarding in a robust version.
predict_output_train <- round(predict(lm_reg, newdata = data_full_train))

conf_matrix_train <- table(predict_output_train, zip_Y_train)

# Accuracy and error of the model on the training data
accuracy_train <- sum(diag(conf_matrix_train)) / sum(conf_matrix_train)
accuracy_train
## [1] 0.9942405
error_train <- 1 - accuracy_train
error_train
## [1] 0.005759539

Run prediction on test data

# Test-set predictions: same rounding scheme; round() is already vectorised,
# so the original sapply() wrapper is unnecessary.
predict_output_test <- round(predict(lm_reg, newdata = data_full_test))
conf_matrix_test <- table(predict_output_test, zip_Y_test)

# Accuracy and error of the model on the test data
accuracy_test <- sum(diag(conf_matrix_test)) / sum(conf_matrix_test)
accuracy_test
## [1] 0.9587912
error_test <- 1 - accuracy_test
error_test
## [1] 0.04120879

knn for k = 1

library(class)

# k-NN with k = 1: test-set error (use.all = TRUE keeps all tied neighbours)
# NOTE(review): this pattern is repeated verbatim for k = 3, 5, 7, 15 below;
# a helper function over k would remove the duplication.
knn_1 <- knn(zip_X_train, zip_X_test , cl = zip_Y_train , k=1, use.all = TRUE)

conf_matrix_knn_1_test <- table(knn_1, zip_Y_test)
accuracy_knn_1_test <- sum(diag(conf_matrix_knn_1_test))/ sum(conf_matrix_knn_1_test)
accuracy_knn_1_test
## [1] 0.9752747
error_knn_1_test <- 1 - accuracy_knn_1_test
error_knn_1_test
## [1] 0.02472527
# Training error: with k = 1 each point is its own nearest neighbour, so 0
knn_1_train <- knn(zip_X_train, zip_X_train , cl = zip_Y_train , k=1, use.all = TRUE)

conf_matrix_knn_1_train <- table(knn_1_train, zip_Y_train)
accuracy_knn_train_1 <- sum(diag(conf_matrix_knn_1_train))/ sum(conf_matrix_knn_1_train)
accuracy_knn_train_1
## [1] 1
error_knn_1_train <- 1 - accuracy_knn_train_1
error_knn_1_train
## [1] 0

knn for k = 3

# k-NN with k = 3: test-set error
knn_3 <- knn(zip_X_train, zip_X_test , cl = zip_Y_train , k=3, use.all = TRUE)

conf_matrix_knn_3_test <- table(knn_3, zip_Y_test)
accuracy_knn_3_test <- sum(diag(conf_matrix_knn_3_test))/ sum(conf_matrix_knn_3_test)
accuracy_knn_3_test
## [1] 0.9697802
error_knn_3_test <- 1 - accuracy_knn_3_test
error_knn_3_test
## [1] 0.03021978
# k-NN with k = 3: training error
knn_3_train <- knn(zip_X_train, zip_X_train , cl = zip_Y_train , k=3, use.all = TRUE)

conf_matrix_knn_3_train <- table(knn_3_train, zip_Y_train)
accuracy_knn_train_3 <- sum(diag(conf_matrix_knn_3_train))/ sum(conf_matrix_knn_3_train)
accuracy_knn_train_3
## [1] 0.9949604
error_knn_3_train <- 1 - accuracy_knn_train_3
error_knn_3_train
## [1] 0.005039597

knn for k = 5

# k-NN with k = 5: test-set error
knn_5 <- knn(zip_X_train, zip_X_test , cl = zip_Y_train , k=5, use.all = TRUE)

conf_matrix_knn_5_test <- table(knn_5, zip_Y_test)
accuracy_knn_5_test <- sum(diag(conf_matrix_knn_5_test))/ sum(conf_matrix_knn_5_test)
accuracy_knn_5_test
## [1] 0.9697802
error_knn_5_test <- 1 - accuracy_knn_5_test
error_knn_5_test
## [1] 0.03021978
# k-NN with k = 5: training error
knn_5_train <- knn(zip_X_train, zip_X_train , cl = zip_Y_train , k=5, use.all = TRUE)

conf_matrix_knn_5_train <- table(knn_5_train, zip_Y_train)
accuracy_knn_train_5 <- sum(diag(conf_matrix_knn_5_train))/ sum(conf_matrix_knn_5_train)
accuracy_knn_train_5
## [1] 0.9942405
error_knn_5_train <- 1 - accuracy_knn_train_5
error_knn_5_train
## [1] 0.005759539

knn for k = 7

# k-NN with k = 7: test-set error
knn_7 <- knn(zip_X_train, zip_X_test , cl = zip_Y_train , k=7, use.all = TRUE)

conf_matrix_knn_7_test <- table(knn_7, zip_Y_test)
accuracy_knn_7_test <- sum(diag(conf_matrix_knn_7_test))/ sum(conf_matrix_knn_7_test)
accuracy_knn_7_test
## [1] 0.967033
error_knn_7_test <- 1 - accuracy_knn_7_test
error_knn_7_test
## [1] 0.03296703
# k-NN with k = 7: training error
knn_7_train <- knn(zip_X_train, zip_X_train , cl = zip_Y_train , k=7, use.all = TRUE)

conf_matrix_knn_7_train <- table(knn_7_train, zip_Y_train)
accuracy_knn_train_7 <- sum(diag(conf_matrix_knn_7_train))/ sum(conf_matrix_knn_7_train)
accuracy_knn_train_7
## [1] 0.9935205
error_knn_7_train <- 1 - accuracy_knn_train_7
error_knn_7_train
## [1] 0.006479482

knn for k = 15

# k-NN with k = 15: test-set error
knn_15 <- knn(zip_X_train, zip_X_test , cl = zip_Y_train , k=15, use.all = TRUE)

conf_matrix_knn_15_test <- table(knn_15, zip_Y_test)
accuracy_knn_15_test <- sum(diag(conf_matrix_knn_15_test))/ sum(conf_matrix_knn_15_test)
accuracy_knn_15_test
## [1] 0.9615385
error_knn_15_test <- 1 - accuracy_knn_15_test
error_knn_15_test
## [1] 0.03846154
# k-NN with k = 15: training error
knn_15_train <- knn(zip_X_train, zip_X_train , cl = zip_Y_train , k=15, use.all = TRUE)

conf_matrix_knn_15_train <- table(knn_15_train, zip_Y_train)
accuracy_knn_train_15 <- sum(diag(conf_matrix_knn_15_train))/ sum(conf_matrix_knn_15_train)
accuracy_knn_train_15
## [1] 0.9906407
error_knn_15_train <- 1 - accuracy_knn_train_15
error_knn_15_train
## [1] 0.009359251

Plot Errors

# k values and the corresponding kNN error rates collected above.
x_axis <- c(1, 3, 5, 7, 15)
y_axis_train <- c(error_knn_1_train, error_knn_3_train, error_knn_5_train,
                  error_knn_7_train, error_knn_15_train)
y_axis_test <- c(error_knn_1_test, error_knn_3_test, error_knn_5_test,
                 error_knn_7_test, error_knn_15_test)

plot_data <- data.frame(x_axis, y_axis_test, y_axis_train)
library(ggplot2)

# Named `plot_theme` so it does not shadow ggplot2::theme().
plot_theme <- theme(panel.grid = element_blank(),
                    panel.background = element_blank(),
                    axis.line = element_line(colour = "black"))

# Both aesthetics are mapped inside aes() (the original passed the train
# y-vector outside aes, which is fragile), and the stray trailing comma in
# the second geom_hline() call is removed.
ggplot(plot_data, aes(x = x_axis)) +
  geom_line(aes(y = y_axis_test, col = "test_knn")) +
  geom_line(aes(y = y_axis_train, col = "train_knn")) +
  xlim(c(0, 16)) + ylim(c(0, 0.05)) +
  geom_hline(aes(yintercept = error_test, col = "lm_test"),
             size = 1.5, linetype = "dashed") +
  geom_hline(aes(yintercept = error_train, col = "lm_train"),
             size = 1.5, linetype = "dashed") +
  scale_color_manual(name = "Labels",
                     values = c(lm_test = "aquamarine4",
                                lm_train = "aquamarine2",
                                test_knn = "purple",
                                train_knn = "brown")) +
  ggtitle("Error rate for linear regression vs knn") +
  plot_theme + xlab("K values") + ylab("Error rates")

Question 9: This question involves the use of multiple linear regression on the Auto data set.

Read dataset

library(ISLR)
# Auto data: 392 observations of 9 variables (see str() output below;
# `name` is a factor, the rest are numeric)
data <- data.frame(Auto)

dim(data)
## [1] 392   9
str(data)
## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : num  3504 3693 3436 3433 3449 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : num  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...

(a) Produce a scatterplot matrix which includes all of the variables in the data set.

# (a) Scatterplot matrix of all variables (lower triangle only)
pairs(data, col = "aquamarine4", upper.panel = NULL, main = "Scatterplot for Auto Dataset")

(b) Compute the matrix of correlations between the variables using the function cor(). You will need to exclude the name variable, which is qualitative.

# (b) Correlation matrix, excluding the qualitative name column (column 9)
cor(data[, -9])
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
##              acceleration       year     origin
## mpg             0.4233285  0.5805410  0.5652088
## cylinders      -0.5046834 -0.3456474 -0.5689316
## displacement   -0.5438005 -0.3698552 -0.6145351
## horsepower     -0.6891955 -0.4163615 -0.4551715
## weight         -0.4168392 -0.3091199 -0.5850054
## acceleration    1.0000000  0.2903161  0.2127458
## year            0.2903161  1.0000000  0.1815277
## origin          0.2127458  0.1815277  1.0000000

(c) Use the lm() function to perform a multiple linear regression with mpg as the response and all other variables except name as the predictors.

# (c) Multiple regression of mpg on all predictors except name (column 9)
lm_fit <- lm(mpg ~ ., data = data[,-9])
summary(lm_fit)
## 
## Call:
## lm(formula = mpg ~ ., data = data[, -9])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5903 -2.1565 -0.1169  1.8690 13.0604 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -17.218435   4.644294  -3.707  0.00024 ***
## cylinders     -0.493376   0.323282  -1.526  0.12780    
## displacement   0.019896   0.007515   2.647  0.00844 ** 
## horsepower    -0.016951   0.013787  -1.230  0.21963    
## weight        -0.006474   0.000652  -9.929  < 2e-16 ***
## acceleration   0.080576   0.098845   0.815  0.41548    
## year           0.750773   0.050973  14.729  < 2e-16 ***
## origin         1.426141   0.278136   5.127 4.67e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared:  0.8215, Adjusted R-squared:  0.8182 
## F-statistic: 252.4 on 7 and 384 DF,  p-value: < 2.2e-16

(c.i) Is there a relationship between the predictors and the response?

# Correlations of each predictor with mpg (first row of the matrix)
cor(data[, -9])[1,]
##          mpg    cylinders displacement   horsepower       weight 
##    1.0000000   -0.7776175   -0.8051269   -0.7784268   -0.8322442 
## acceleration         year       origin 
##    0.4233285    0.5805410    0.5652088

Yes, there is a relationship between the predictors and the response. mpg has a strong negative correlation with weight, displacement, cylinders and horsepower; the remaining predictors have a positive correlation with mpg.

(c.ii) Which predictors appear to have a statistically significant relationship to the response?

Predictors displacement, weight, year and origin appear to have statistically significant relationship to response based on the above regression output and p-value

(c.iii) What does the coefficient for the year variable suggest?

The coefficient for year is about 0.75, i.e. holding the other predictors fixed, each additional model year increases the predicted mpg by about 0.75 miles per gallon — year has a positive relationship with the response.

(d) Use the plot() function to produce diagnostic plots of the linear regression fit.Comment on any problems you see with the fit. Do the residual plots suggest any unusually large outliers?Does the leverage plot identify any observations with unusually high leverage?

# (d) Standard lm diagnostic plots: Residuals vs Fitted, Normal Q-Q,
# Scale-Location, Residuals vs Leverage
plot(lm_fit , col = "aquamarine4")

The Residuals vs Fitted plot displays some non-linearity in the data. The Normal Q-Q plot suggests that the residuals are roughly normal. The Scale-Location curve indicates the residuals have a random spread along the range of the predictors, so the residuals are roughly homoscedastic. The Residuals vs Leverage plot shows some mild outliers and a high-leverage point (observation 14).

(e) Use the * and : symbols to fit linear regression models with interaction effects.Do any interactions appear to be statistically significant?

# (e) '*' expands to both main effects plus the interaction term
lm_fit_int <- lm(mpg ~ horsepower * displacement, data = data[,-9])
summary(lm_fit_int)
## 
## Call:
## lm(formula = mpg ~ horsepower * displacement, data = data[, -9])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.9391  -2.3373  -0.5816   2.1698  17.5771 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              5.305e+01  1.526e+00   34.77   <2e-16 ***
## horsepower              -2.343e-01  1.959e-02  -11.96   <2e-16 ***
## displacement            -9.805e-02  6.682e-03  -14.67   <2e-16 ***
## horsepower:displacement  5.828e-04  5.193e-05   11.22   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.944 on 388 degrees of freedom
## Multiple R-squared:  0.7466, Adjusted R-squared:  0.7446 
## F-statistic:   381 on 3 and 388 DF,  p-value: < 2.2e-16
# ':' adds only the horsepower x displacement interaction, no main effects
lm_fit_int_col <- lm(mpg ~ horsepower:displacement, data = data[,-9])
summary(lm_fit_int_col)
## 
## Call:
## lm(formula = mpg ~ horsepower:displacement, data = data[, -9])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.1917  -3.9460  -0.9919   3.0108  18.2170 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.989e+01  3.901e-01   76.62   <2e-16 ***
## horsepower:displacement -2.694e-04  1.209e-05  -22.28   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.184 on 390 degrees of freedom
## Multiple R-squared:   0.56,  Adjusted R-squared:  0.5589 
## F-statistic: 496.4 on 1 and 390 DF,  p-value: < 2.2e-16
# (. * .) expands to all main effects plus all pairwise interactions
lm_fit_all <- lm(mpg ~ (.*.), data = data[,-9])
summary(lm_fit_all)
## 
## Call:
## lm(formula = mpg ~ (. * .), data = data[, -9])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6303 -1.4481  0.0596  1.2739 11.1386 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                3.548e+01  5.314e+01   0.668  0.50475   
## cylinders                  6.989e+00  8.248e+00   0.847  0.39738   
## displacement              -4.785e-01  1.894e-01  -2.527  0.01192 * 
## horsepower                 5.034e-01  3.470e-01   1.451  0.14769   
## weight                     4.133e-03  1.759e-02   0.235  0.81442   
## acceleration              -5.859e+00  2.174e+00  -2.696  0.00735 **
## year                       6.974e-01  6.097e-01   1.144  0.25340   
## origin                    -2.090e+01  7.097e+00  -2.944  0.00345 **
## cylinders:displacement    -3.383e-03  6.455e-03  -0.524  0.60051   
## cylinders:horsepower       1.161e-02  2.420e-02   0.480  0.63157   
## cylinders:weight           3.575e-04  8.955e-04   0.399  0.69000   
## cylinders:acceleration     2.779e-01  1.664e-01   1.670  0.09584 . 
## cylinders:year            -1.741e-01  9.714e-02  -1.793  0.07389 . 
## cylinders:origin           4.022e-01  4.926e-01   0.816  0.41482   
## displacement:horsepower   -8.491e-05  2.885e-04  -0.294  0.76867   
## displacement:weight        2.472e-05  1.470e-05   1.682  0.09342 . 
## displacement:acceleration -3.479e-03  3.342e-03  -1.041  0.29853   
## displacement:year          5.934e-03  2.391e-03   2.482  0.01352 * 
## displacement:origin        2.398e-02  1.947e-02   1.232  0.21875   
## horsepower:weight         -1.968e-05  2.924e-05  -0.673  0.50124   
## horsepower:acceleration   -7.213e-03  3.719e-03  -1.939  0.05325 . 
## horsepower:year           -5.838e-03  3.938e-03  -1.482  0.13916   
## horsepower:origin          2.233e-03  2.930e-02   0.076  0.93931   
## weight:acceleration        2.346e-04  2.289e-04   1.025  0.30596   
## weight:year               -2.245e-04  2.127e-04  -1.056  0.29182   
## weight:origin             -5.789e-04  1.591e-03  -0.364  0.71623   
## acceleration:year          5.562e-02  2.558e-02   2.174  0.03033 * 
## acceleration:origin        4.583e-01  1.567e-01   2.926  0.00365 **
## year:origin                1.393e-01  7.399e-02   1.882  0.06062 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.695 on 363 degrees of freedom
## Multiple R-squared:  0.8893, Adjusted R-squared:  0.8808 
## F-statistic: 104.2 on 28 and 363 DF,  p-value: < 2.2e-16

A few statistically significant interactions are displacement:year, acceleration:year and acceleration:origin.

(f) Try a few different transformations of the variables, such as log(X), √X, X². Comment on your findings.

# (f) Assorted transformations: squared cylinders/acceleration, square-root
# displacement/horsepower, log weight; year and origin untransformed
lm_fit_f_part <- lm(mpg~  I(cylinders^2) + sqrt(displacement) + sqrt(horsepower) + log(weight) + 
                   I(acceleration^2)  + year + origin , data = data[, -9])
summary(lm_fit_f_part)
## 
## Call:
## lm(formula = mpg ~ I(cylinders^2) + sqrt(displacement) + sqrt(horsepower) + 
##     log(weight) + I(acceleration^2) + year + origin, data = data[, 
##     -9])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.1069 -1.9532 -0.0089  1.7099 12.8016 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        123.571135  12.217980  10.114  < 2e-16 ***
## I(cylinders^2)       0.016429   0.025326   0.649  0.51693    
## sqrt(displacement)   0.199035   0.207529   0.959  0.33813    
## sqrt(horsepower)    -0.328482   0.280941  -1.169  0.24304    
## log(weight)        -20.168541   1.913320 -10.541  < 2e-16 ***
## I(acceleration^2)    0.003127   0.002837   1.102  0.27101    
## year                 0.766080   0.047939  15.980  < 2e-16 ***
## origin               0.952517   0.273740   3.480  0.00056 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.106 on 384 degrees of freedom
## Multiple R-squared:  0.8445, Adjusted R-squared:  0.8416 
## F-statistic: 297.9 on 7 and 384 DF,  p-value: < 2.2e-16

Overall the model is statistically significant. Among the transformations, log(weight) provides significant results.

Question 15 Predict per capita crime rate using the other variables in this data set.In other words, per capita crime rate is the response, and the other variables are the predictors

(a) For each predictor, fit a simple linear regression model to predict the response.Describe your results. In which of the models is there a statistically significant association between the predictor and the response? Create some plots to back up your assertions.

library(MASS)

# Simple linear regression of crim on a single predictor: plot the data with
# the fitted line, print the coefficient table, and return the slope estimate.
run_lm <- function(x) {
  lm_run <- lm(crim ~ x, data = Boston)
  with(Boston, plot(x, crim, col = "aquamarine4"))
  abline(lm_run)
  print(summary(lm_run)$coefficients)
  # Slope (row 2, column 1 of the coefficient table)
  summary(lm_run)$coefficients[2, 1]
}

# One univariate fit per predictor, crim itself (column 1) excluded.
# vapply() is safer than sapply(): the return type is pinned to numeric(1).
df_output <- as.data.frame(colnames(Boston[, -1]))
df_output <- cbind(df_output, vapply(Boston[, -1], run_lm, numeric(1)))

##                Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)  4.45369376  0.4172178 10.674746 4.037668e-24
## x           -0.07393498  0.0160946 -4.593776 5.506472e-06

##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) -2.0637426 0.66722830 -3.093008 2.091266e-03
## x            0.5097763 0.05102433  9.990848 1.450349e-21

##              Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)  3.744447  0.3961111  9.453021 1.239505e-19
## x           -1.892777  1.5061155 -1.256727 2.094345e-01

##              Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) -13.71988   1.699479 -8.072992 5.076814e-15
## x            31.24853   2.999190 10.418989 3.751739e-23

##              Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 20.481804  3.3644742  6.087669 2.272000e-09
## x           -2.684051  0.5320411 -5.044819 6.346703e-07

##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) -3.7779063 0.94398472 -4.002084 7.221718e-05
## x            0.1077862 0.01273644  8.462825 2.854869e-16

##              Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)  9.499262  0.7303972 13.005611 1.502748e-33
## x           -1.550902  0.1683300 -9.213458 8.519949e-19

##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) -2.2871594 0.44347583 -5.157349 3.605846e-07
## x            0.6179109 0.03433182 17.998199 2.693844e-56

##                Estimate  Std. Error   t value     Pr(>|t|)
## (Intercept) -8.52836909 0.815809392 -10.45387 2.773600e-23
## x            0.02974225 0.001847415  16.09939 2.357127e-47

##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) -17.646933  3.1472718 -5.607057 3.395255e-08
## x             1.151983  0.1693736  6.801430 2.942922e-11

##                Estimate  Std. Error   t value     Pr(>|t|)
## (Intercept) 16.55352922 1.425902755 11.609157 8.922239e-28
## x           -0.03627964 0.003873154 -9.366951 2.487274e-19

##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) -3.3305381 0.69375829 -4.800718 2.087022e-06
## x            0.5488048 0.04776097 11.490654 2.654277e-27

##               Estimate Std. Error  t value     Pr(>|t|)
## (Intercept) 11.7965358 0.93418916 12.62757 5.934119e-32
## x           -0.3631599 0.03839017 -9.45971 1.173987e-19

All the predictors except chas are statistically significant in their univariate regressions.

(b) Fit a multiple regression model to predict the response using all of the predictors. Describe your results. For which predictors can we reject the null hypothesis H0 : βj = 0?

# (b) Multiple regression of crim on all other Boston predictors
lm_multiple <- lm(crim ~ . , data = Boston)
summary(lm_multiple)
## 
## Call:
## lm(formula = crim ~ ., data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.924 -2.120 -0.353  1.019 75.051 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  17.033228   7.234903   2.354 0.018949 *  
## zn            0.044855   0.018734   2.394 0.017025 *  
## indus        -0.063855   0.083407  -0.766 0.444294    
## chas         -0.749134   1.180147  -0.635 0.525867    
## nox         -10.313535   5.275536  -1.955 0.051152 .  
## rm            0.430131   0.612830   0.702 0.483089    
## age           0.001452   0.017925   0.081 0.935488    
## dis          -0.987176   0.281817  -3.503 0.000502 ***
## rad           0.588209   0.088049   6.680 6.46e-11 ***
## tax          -0.003780   0.005156  -0.733 0.463793    
## ptratio      -0.271081   0.186450  -1.454 0.146611    
## black        -0.007538   0.003673  -2.052 0.040702 *  
## lstat         0.126211   0.075725   1.667 0.096208 .  
## medv         -0.198887   0.060516  -3.287 0.001087 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.439 on 492 degrees of freedom
## Multiple R-squared:  0.454,  Adjusted R-squared:  0.4396 
## F-statistic: 31.47 on 13 and 492 DF,  p-value: < 2.2e-16
# Slopes from the multiple regression (intercept dropped) paired with the
# univariate slopes collected in df_output, for part (c)'s comparison plot
multiple_reg_coeff <- summary(lm_multiple)$coefficients[-1,1]

df_out <- cbind(df_output,multiple_reg_coeff)
colnames(df_out) <- c("predictors","single_reg_coeff" , "multiple_reg_coeff")

# Diagnostic plots for the multiple regression fit
plot(lm_multiple, col= "aquamarine4")

Based on the above regression results, we can reject H0 for zn, dis, rad, black and medv.

(c) How do your results from (a) compare to your results from (b)?Create a plot displaying the univariate regression coefficients from (a) on the x-axis,and the multiple regression coefficients from (b) on the y-axis. That is, each predictor is displayed as a single point in the plot.Its coefficient in a simple linear regression model is shown on the x-axis,and its coefficient estimate in the multiple linear regression model is shown on the y-axis.

library(ggplot2)

# (c) One point per predictor: univariate slope on the x-axis against the
# multivariate slope on the y-axis.
ggplot(df_out, aes(x = single_reg_coeff, y = multiple_reg_coeff)) +
  geom_point(aes(color = predictors)) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black")) +
  xlab("Univariate coefficients") +
  ylab("Mutlivariate coefficients") +
  ggtitle("Univariate VS Multivariate coefficient")

In the univariate fits, chas was not statistically significant but all the other predictors were, whereas in the multivariate regression only zn, dis, rad, black and medv are significant.

(d) Is there evidence of non-linear association between any of the predictors and the response?To answer this question, for each predictor X, fit a model of the form Y = β0 +β1X +β2X2 +β3X3 +ε.

# Cubic polynomial regression of crim on a single predictor, to look for
# non-linear association.
run_lm <- function(x) {
  lm_run_1 <- lm(crim ~ poly(x, 3) , data = Boston)
  summary(lm_run_1)
}

# Fit one cubic model per predictor. Drop crim itself (column 1) as well as
# the binary chas (column 4): regressing crim on crim is what produced the
# "essentially perfect fit" warning and the meaningless $crim block in the
# original output, and poly() on a 0/1 dummy is not meaningful.
# lapply() iterates over columns directly, without apply()'s matrix coercion.
lapply(Boston[, -c(1, 4)], run_lm)
## Warning in summary.lm(lm_run_1): essentially perfect fit: summary may be
## unreliable
## $crim
## 
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -4.318e-14 -3.980e-16  4.500e-17  1.270e-16  5.602e-14 
## 
## Coefficients:
##               Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)  3.614e+00  1.444e-16  2.502e+16  < 2e-16 ***
## poly(x, 3)1  1.933e+02  3.248e-15  5.951e+16  < 2e-16 ***
## poly(x, 3)2  1.574e-14  3.248e-15  4.845e+00 1.69e-06 ***
## poly(x, 3)3 -1.546e-14  3.248e-15 -4.758e+00 2.56e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.248e-15 on 502 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 1.18e+33 on 3 and 502 DF,  p-value: < 2.2e-16
## 
## 
## $zn
## 
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.821 -4.614 -1.294  0.473 84.130 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.6135     0.3722   9.709  < 2e-16 ***
## poly(x, 3)1 -38.7498     8.3722  -4.628  4.7e-06 ***
## poly(x, 3)2  23.9398     8.3722   2.859  0.00442 ** 
## poly(x, 3)3 -10.0719     8.3722  -1.203  0.22954    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.372 on 502 degrees of freedom
## Multiple R-squared:  0.05824,    Adjusted R-squared:  0.05261 
## F-statistic: 10.35 on 3 and 502 DF,  p-value: 1.281e-06
## 
## 
## $indus
## 
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.278 -2.514  0.054  0.764 79.713 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    3.614      0.330  10.950  < 2e-16 ***
## poly(x, 3)1   78.591      7.423  10.587  < 2e-16 ***
## poly(x, 3)2  -24.395      7.423  -3.286  0.00109 ** 
## poly(x, 3)3  -54.130      7.423  -7.292  1.2e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.423 on 502 degrees of freedom
## Multiple R-squared:  0.2597, Adjusted R-squared:  0.2552 
## F-statistic: 58.69 on 3 and 502 DF,  p-value: < 2.2e-16
## 
## 
## $nox
## 
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.110 -2.068 -0.255  0.739 78.302 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.6135     0.3216  11.237  < 2e-16 ***
## poly(x, 3)1  81.3720     7.2336  11.249  < 2e-16 ***
## poly(x, 3)2 -28.8286     7.2336  -3.985 7.74e-05 ***
## poly(x, 3)3 -60.3619     7.2336  -8.345 6.96e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.234 on 502 degrees of freedom
## Multiple R-squared:  0.297,  Adjusted R-squared:  0.2928 
## F-statistic: 70.69 on 3 and 502 DF,  p-value: < 2.2e-16
## 
## 
## $rm
## 
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.485  -3.468  -2.221  -0.015  87.219 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.6135     0.3703   9.758  < 2e-16 ***
## poly(x, 3)1 -42.3794     8.3297  -5.088 5.13e-07 ***
## poly(x, 3)2  26.5768     8.3297   3.191  0.00151 ** 
## poly(x, 3)3  -5.5103     8.3297  -0.662  0.50858    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.33 on 502 degrees of freedom
## Multiple R-squared:  0.06779,    Adjusted R-squared:  0.06222 
## F-statistic: 12.17 on 3 and 502 DF,  p-value: 1.067e-07
## 
## 
## $age
## 
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.762 -2.673 -0.516  0.019 82.842 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.6135     0.3485  10.368  < 2e-16 ***
## poly(x, 3)1  68.1820     7.8397   8.697  < 2e-16 ***
## poly(x, 3)2  37.4845     7.8397   4.781 2.29e-06 ***
## poly(x, 3)3  21.3532     7.8397   2.724  0.00668 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.84 on 502 degrees of freedom
## Multiple R-squared:  0.1742, Adjusted R-squared:  0.1693 
## F-statistic: 35.31 on 3 and 502 DF,  p-value: < 2.2e-16
## 
## 
## $dis
## 
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.757  -2.588   0.031   1.267  76.378 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.6135     0.3259  11.087  < 2e-16 ***
## poly(x, 3)1 -73.3886     7.3315 -10.010  < 2e-16 ***
## poly(x, 3)2  56.3730     7.3315   7.689 7.87e-14 ***
## poly(x, 3)3 -42.6219     7.3315  -5.814 1.09e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.331 on 502 degrees of freedom
## Multiple R-squared:  0.2778, Adjusted R-squared:  0.2735 
## F-statistic: 64.37 on 3 and 502 DF,  p-value: < 2.2e-16
## 
## 
## $rad
## 
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.381  -0.412  -0.269   0.179  76.217 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.6135     0.2971  12.164  < 2e-16 ***
## poly(x, 3)1 120.9074     6.6824  18.093  < 2e-16 ***
## poly(x, 3)2  17.4923     6.6824   2.618  0.00912 ** 
## poly(x, 3)3   4.6985     6.6824   0.703  0.48231    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.682 on 502 degrees of freedom
## Multiple R-squared:    0.4,  Adjusted R-squared:  0.3965 
## F-statistic: 111.6 on 3 and 502 DF,  p-value: < 2.2e-16
## 
## 
## $tax
## 
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.273  -1.389   0.046   0.536  76.950 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.6135     0.3047  11.860  < 2e-16 ***
## poly(x, 3)1 112.6458     6.8537  16.436  < 2e-16 ***
## poly(x, 3)2  32.0873     6.8537   4.682 3.67e-06 ***
## poly(x, 3)3  -7.9968     6.8537  -1.167    0.244    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.854 on 502 degrees of freedom
## Multiple R-squared:  0.3689, Adjusted R-squared:  0.3651 
## F-statistic:  97.8 on 3 and 502 DF,  p-value: < 2.2e-16
## 
## 
## $ptratio
## 
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.833 -4.146 -1.655  1.408 82.697 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    3.614      0.361  10.008  < 2e-16 ***
## poly(x, 3)1   56.045      8.122   6.901 1.57e-11 ***
## poly(x, 3)2   24.775      8.122   3.050  0.00241 ** 
## poly(x, 3)3  -22.280      8.122  -2.743  0.00630 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.122 on 502 degrees of freedom
## Multiple R-squared:  0.1138, Adjusted R-squared:  0.1085 
## F-statistic: 21.48 on 3 and 502 DF,  p-value: 4.171e-13
## 
## 
## $black
## 
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.096  -2.343  -2.128  -1.439  86.790 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.6135     0.3536  10.218   <2e-16 ***
## poly(x, 3)1 -74.4312     7.9546  -9.357   <2e-16 ***
## poly(x, 3)2   5.9264     7.9546   0.745    0.457    
## poly(x, 3)3  -4.8346     7.9546  -0.608    0.544    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.955 on 502 degrees of freedom
## Multiple R-squared:  0.1498, Adjusted R-squared:  0.1448 
## F-statistic: 29.49 on 3 and 502 DF,  p-value: < 2.2e-16
## 
## 
## $lstat
## 
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.234  -2.151  -0.486   0.066  83.353 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.6135     0.3392  10.654   <2e-16 ***
## poly(x, 3)1  88.0697     7.6294  11.543   <2e-16 ***
## poly(x, 3)2  15.8882     7.6294   2.082   0.0378 *  
## poly(x, 3)3 -11.5740     7.6294  -1.517   0.1299    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.629 on 502 degrees of freedom
## Multiple R-squared:  0.2179, Adjusted R-squared:  0.2133 
## F-statistic: 46.63 on 3 and 502 DF,  p-value: < 2.2e-16
## 
## 
## $medv
## 
## Call:
## lm(formula = crim ~ poly(x, 3), data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.427  -1.976  -0.437   0.439  73.655 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    3.614      0.292  12.374  < 2e-16 ***
## poly(x, 3)1  -75.058      6.569 -11.426  < 2e-16 ***
## poly(x, 3)2   88.086      6.569  13.409  < 2e-16 ***
## poly(x, 3)3  -48.033      6.569  -7.312 1.05e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.569 on 502 degrees of freedom
## Multiple R-squared:  0.4202, Adjusted R-squared:  0.4167 
## F-statistic: 121.3 on 3 and 502 DF,  p-value: < 2.2e-16

Based on the output, for predictors such as zn, rm, rad, tax and lstat the cubic term is not statistically significant, although the linear and quadratic terms are. black is significant only for the linear term, whereas indus, nox, age, dis, ptratio and medv are significant up to the third degree of the polynomial.